{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load Dataset and re-process"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-07T10:00:39.285937Z",
     "start_time": "2025-05-07T10:00:39.283576Z"
    }
   },
   "outputs": [],
   "source": [
    "from datasets import load_dataset,load_from_disk,Dataset\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-07T10:12:45.124847Z",
     "start_time": "2025-05-07T10:12:44.997877Z"
    }
   },
   "outputs": [],
   "source": [
    "# Import training and test datasets from Alpaca-GPT4 or RetrievalQA datasets (You can download the datasets following the README.md)\n",
    "datasets_name = 'alpaca' # 'alpaca'/'gptrqa'\n",
    "\n",
    "if datasets_name == 'alpaca':\n",
    "    train_dataset_df = load_from_disk('datasets/alpaca-train') \n",
    "    test_dataset_df = load_from_disk('datasets/alpaca-test')   \n",
    "else:\n",
    "    train_dataset_df = load_from_disk('datasets/GPTRQA-train') \n",
    "    test_dataset_df = load_from_disk('datasets/GPTRQA-test')  "
   ]
  },
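  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick, optional sanity check: the tokenization and generation steps below assume the loaded datasets are Hugging Face `Dataset` objects with `input_text` and `target_text` columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional check: later cells rely on these column names\n",
    "expected_cols = {'input_text', 'target_text'}\n",
    "print('Train columns:', train_dataset_df.column_names)\n",
    "print('Test columns: ', test_dataset_df.column_names)\n",
    "assert expected_cols.issubset(train_dataset_df.column_names), 'training set is missing expected columns'\n",
    "assert expected_cols.issubset(test_dataset_df.column_names), 'test set is missing expected columns'"
   ]
  },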
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check the first 10 entries of the dataset\n",
    "pd.DataFrame(test_dataset_df).head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## DI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Main work section: examining whether the split dataset affects model output (desired result: split should not significantly affect results)\n",
    "split_size = 0.2 # ratio between 0.2-0.5\n",
    "train_test_split_t5 = train_dataset_df.train_test_split(test_size=split_size, seed=42) # seed ensures reproducibility\n",
    "train_dataset_t5 = train_test_split_t5['train']\n",
    "val_dataset_t5 = train_test_split_t5['test']"
   ]
  },
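  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An optional check that the split produced the expected sizes (a minimal sketch, assuming both subsets are standard `Dataset` objects):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: confirm the train/validation subsets add up to the original training set\n",
    "print(f'Train subset size     : {len(train_dataset_t5)}')\n",
    "print(f'Validation subset size: {len(val_dataset_t5)}')\n",
    "print(f'Original training size: {len(train_dataset_df)}')"
   ]
  },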
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize tokenizer\n",
    "from transformers import T5Tokenizer\n",
    "\n",
    "t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')\n",
    "\n",
    "def tokenize_t5_function(examples):\n",
    "    model_inputs = t5_tokenizer(examples['input_text'], padding=\"max_length\", truncation=True)\n",
    "    labels = t5_tokenizer(examples['target_text'], padding=\"max_length\", truncation=True)\n",
    "    print(labels)\n",
    "    model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
    "    return model_inputs\n",
    "\n",
    "tokenized_train_dataset_t5 = train_dataset_t5.map(tokenize_t5_function, batched=True)\n",
    "tokenized_val_dataset_t5 = val_dataset_t5.map(tokenize_t5_function, batched=True)"
   ]
  },
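  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note: with `padding=\"max_length\"`, the pad tokens in `labels` also contribute to the training loss. A common variant (not used above) replaces them with `-100`, which the Hugging Face loss ignores. A minimal sketch of that alternative tokenization function:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative tokenization (sketch, not used above): mask label padding with -100\n",
    "# so padded positions are ignored by the cross-entropy loss.\n",
    "def tokenize_t5_masked(examples):\n",
    "    model_inputs = t5_tokenizer(examples['input_text'], padding=\"max_length\", truncation=True)\n",
    "    labels = t5_tokenizer(examples['target_text'], padding=\"max_length\", truncation=True)\n",
    "    model_inputs[\"labels\"] = [\n",
    "        [(tok if tok != t5_tokenizer.pad_token_id else -100) for tok in label_ids]\n",
    "        for label_ids in labels[\"input_ids\"]\n",
    "    ]\n",
    "    return model_inputs"
   ]
  },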
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load model to GPU\n",
    "import torch\n",
    "from transformers import T5ForConditionalGeneration\n",
    "\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "model_path='DI'\n",
    "t5_model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Trainer, TrainingArguments\n",
    "\n",
    "num_epoches = 25\n",
    "\n",
    "training_args_t5 = TrainingArguments(\n",
    "    output_dir= f'split_train_model/{datasets_name}_{split_size}/DI',\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    learning_rate=3e-4,\n",
    "    per_device_train_batch_size=32,\n",
    "    per_device_eval_batch_size=32,\n",
    "    weight_decay=0.01,\n",
    "    save_total_limit=3,\n",
    "    num_train_epochs=num_epoches,\n",
    "    report_to=\"none\"\n",
    ")\n",
    "\n",
    "trainer_t5 = Trainer(\n",
    "    model=t5_model,\n",
    "    args=training_args_t5,\n",
    "    train_dataset=tokenized_train_dataset_t5,\n",
    "    eval_dataset=tokenized_val_dataset_t5\n",
    ")\n"
   ]
  },
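  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The plain `Trainer` works here because inputs and labels are already padded to a fixed length. A common alternative (a sketch only, not used in this notebook) is to tokenize with `padding=False` and let `DataCollatorForSeq2Seq` pad each batch dynamically, which usually reduces memory use:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch of dynamic padding (assumption: tokenization would use padding=False);\n",
    "# the collator would then be passed to Trainer via data_collator=...\n",
    "from transformers import DataCollatorForSeq2Seq\n",
    "\n",
    "dynamic_collator = DataCollatorForSeq2Seq(t5_tokenizer, model=t5_model, label_pad_token_id=-100)"
   ]
  },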
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start training\n",
    "trainer_t5.train()\n",
    "# Save model\n",
    "trainer_t5.save_model(f'DI/{datasets_name}/{datasets_name}_DI_t5_small_{split_size}_{num_epoches}e') \n",
    "t5_tokenizer.save_pretrained(f'DI/{datasets_name}/{datasets_name}_DI_t5_small_{split_size}_{num_epoches}e')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_question(answer):\n",
    "    t5_model.eval()  \n",
    "    input_ids = t5_tokenizer.encode(\"answer: \" + answer, return_tensors=\"pt\").to(device)\n",
    "    outputs = t5_model.generate(input_ids, num_beams=5, early_stopping=True)\n",
    "    question = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
    "    return question"
   ]
  },
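  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick usage example of `generate_question` on a single test entry (optional; the `enquiry: ` prefix handling mirrors the post-processing used in the generation loop below):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Try the generator on one test example before running the full loop\n",
    "sample = test_dataset_df[0]\n",
    "print('Answer   :', sample['input_text'])\n",
    "print('Generated:', generate_question(sample['input_text']).replace('enquiry: ', ''))\n",
    "print('Reference:', sample['target_text'].replace('enquiry: ', ''))"
   ]
  },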
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Adjust global model naming and model prediction result file naming based on current training\n",
    "DI_generation_texts_pth = f'GenText/{datasets_name}/DI_{datasets_name}_gen_{split_size}.txt'"
   ]
  },
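  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The next cell writes to `DI_generation_texts_pth`; if the `GenText/<dataset>` directory does not exist yet, `open(..., 'w')` will fail, so an optional guard:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Create the output directory for generated questions if it is missing\n",
    "os.makedirs(os.path.dirname(DI_generation_texts_pth), exist_ok=True)"
   ]
  },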
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "# Get samples\n",
    "samples = test_dataset_df  \n",
    "res = []\n",
    "# Generate questions and compare\n",
    "with open(DI_generation_texts_pth, 'w') as file:\n",
    "    for example in tqdm(samples):\n",
    "        generated_question = generate_question(example['input_text'])\n",
    "        res.append(generated_question.replace(\"enquiry: \", \"\"))\n",
    "        file.write((generated_question.replace(\"enquiry: \", \"\")+'\\n'))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_metric\n",
    "from nltk.tokenize import word_tokenize\n",
    "from bert_score import score\n",
    "from nltk.translate.meteor_score import meteor_score\n",
    "import evaluate\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import torch\n",
    "\n",
    "def Calmetic(references:list[list[str]], predictions:list[str]):\n",
    "    '''\n",
    "    Input format:\n",
    "\n",
    "    predictions = [\n",
    "        \"What is the capital of France?\",\n",
    "        \"Who wrote the book?\",\n",
    "        \"What is the largest planet?\"\n",
    "    ]\n",
    "\n",
    "    references = [\n",
    "        [\"What is the capital city of France?\"],\n",
    "        [\"Who is the author of the book?\"],\n",
    "        [\"Which planet is the largest in the solar system?\"]\n",
    "    ]\n",
    "    '''\n",
    "\n",
    "    # # Load BLEU scorer\n",
    "    # bleu_metric = load_metric(\"bleu\")\n",
    "\n",
    "    # # Calculate BLEU scores\n",
    "    predictions_tokenized = [word_tokenize(pred) for pred in predictions]\n",
    "    references_tokenized = [[word_tokenize(refs[0])] for refs in references]\n",
    "    # B_S = {}\n",
    "    # for n in range(1, 5):\n",
    "    #     bleu_metric.add_batch(predictions=predictions_tokenized, references=references_tokenized)\n",
    "    #     results = bleu_metric.compute(max_order=n)\n",
    "    #     B_S[f\"BLEU-{n}\"] = results\n",
    "    bleu_metric = evaluate.load(\"bleu\")\n",
    "    B_S = bleu_metric.compute(predictions=predictions, references=references,tokenizer=word_tokenize)\n",
    "    for i,n in enumerate(B_S['precisions']):\n",
    "        print(f\"BLEU-{i+1} score: {n:.5f}\")\n",
    "        \n",
    "\n",
    "    # Load ROUGE scorer\n",
    "    rouge_metric = load_metric(\"rouge\")\n",
    "    '''\n",
    "    ROUGE-1: Measures unigram matches between generated and reference text.\n",
    "    ROUGE-2: Measures bigram matches between generated and reference text.\n",
    "    ROUGE-L: Measures the longest common subsequence (LCS) between generated and reference text.\n",
    "    ROUGE-Lsum: A variant of LCS specifically designed for evaluating long texts.\n",
    "    '''\n",
    "    # Calculate ROUGE scores\n",
    "    rouge_results = rouge_metric.compute(predictions=predictions, references=references)\n",
    "    rouge1_mid_f1 = rouge_results['rouge1'][1][2]\n",
    "    rouge2_mid_f1 = rouge_results['rouge2'][1][2]\n",
    "    rougeL_mid_f1 = rouge_results['rougeL'][1][2]\n",
    "    rougeLsum_mid_f1 = rouge_results['rougeLsum'][1][2]\n",
    "    print(f\"ROUGE-1 F1 score: {rouge1_mid_f1:.5f}\")\n",
    "    print(f\"ROUGE-2 F1 score: {rouge2_mid_f1:.5f}\")\n",
    "    print(f\"ROUGE-L F1 score: {rougeL_mid_f1:.5f}\")\n",
    "    print(f\"ROUGE-Lsum F1 score: {rougeLsum_mid_f1:.5f}\")\n",
    "\n",
    "    # Calculate METEOR scores\n",
    "    meteor_scores = [meteor_score(references=refs, hypothesis=pred) for pred, refs in zip(predictions_tokenized, references_tokenized)]\n",
    "    average_meteor_score = sum(meteor_scores) / len(meteor_scores)\n",
    "    print(f\"Average METEOR score: {average_meteor_score:.5f}\")\n",
    "\n",
    "    # Calculate BERTScore\n",
    "    P, R, F1 = score(predictions, [ref[0] for ref in references], lang=\"en\", verbose=False)\n",
    "    average_bert_score = F1.mean().item()\n",
    "    print(f\"Average BERTScore F1: {average_bert_score:.5f}\")\n",
    "\n",
    "    return {\n",
    "        \"BLEU\":B_S,\n",
    "        \"ROUGE\":rouge_results,\n",
    "        \"METERO\":meteor_scores,\n",
    "        \"BERTScore\":{\"Precision\":P,\"Recall\":R,\"F1\":F1},\n",
    "    }"
   ]
  },
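  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An optional smoke test of `calc_metrics` with the toy data from its docstring (running it downloads the BLEU/ROUGE metric scripts and a BERTScore model):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional smoke test with the toy examples from the docstring\n",
    "toy_predictions = [\n",
    "    \"What is the capital of France?\",\n",
    "    \"Who wrote the book?\",\n",
    "    \"What is the largest planet?\"\n",
    "]\n",
    "toy_references = [\n",
    "    [\"What is the capital city of France?\"],\n",
    "    [\"Who is the author of the book?\"],\n",
    "    [\"Which planet is the largest in the solar system?\"]\n",
    "]\n",
    "_ = calc_metrics(references=toy_references, predictions=toy_predictions)"
   ]
  },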
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('The path you save the DI_generation_text: ', DI_generation_texts_pth)\n",
    "with open(DI_generation_texts_pth, 'r') as file:\n",
    "    content = file.readlines()"
   ]
  },
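  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional sanity check before scoring: the evaluation pairs predictions and references by position, so the counts should match."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The metrics below align predictions and references by index\n",
    "print(f'Generated questions: {len(content)}')\n",
    "print(f'Test examples      : {len(test_dataset_df)}')\n",
    "assert len(content) == len(test_dataset_df), 'prediction/reference count mismatch'"
   ]
  },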
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "refs = [ [i.replace('enquiry: ',\"\")] for i in test_dataset_df['target_text']]\n",
    "res = Calmetic(references=refs,predictions=content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import BertTokenizer, BertModel\n",
    "import torch\n",
    "from torch.nn.functional import cosine_similarity as torch_cosine_similarity\n",
    "from sentence_transformers import SentenceTransformer, util\n",
    "\n",
    "sentence_model = SentenceTransformer('all-MiniLM-L6-v2') \n",
    "\n",
    "reference_texts_ = [ i.replace('enquiry: ',\"\") for i in test_dataset_df['target_text'] ]\n",
    "embeddings1 = sentence_model.encode(content, convert_to_tensor=True)\n",
    "embeddings2 = sentence_model.encode(reference_texts_, convert_to_tensor=True)\n",
    "\n",
    "cosine_scores_2 = util.pytorch_cos_sim(embeddings1, embeddings2)   \n",
    "\n",
    "\n",
    "print(f\"Average Cosine Similarity: {cosine_scores_2.diagonal().mean()}\")\n",
    "print(f\"Biggest Cosine Similarity: {cosine_scores_2.diagonal().max()}\")\n",
    "print(f\"Middle Cosine Similarity: {cosine_scores_2.diagonal().median()}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "img2text",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
